{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import xport\n",
    "import pandas as pd\n",
    "import csv\n",
    "import numpy as np\n",
    "import re\n",
    "import elasticsearch\n",
    "import json\n",
    "import pprint as pprint\n",
    "\n",
    "es = elasticsearch.Elasticsearch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Import data and read into a dataframe\n",
    "f = open('./LLCP2013.ASC', encoding='iso-8859-1')\n",
    "cdc = f.read().splitlines()\n",
    "t = pd.DataFrame({'Var': cdc})\n",
    "\n",
    "# Data references: \n",
    "# - Data: http://www.cdc.gov/brfss/annual_data/2013/files/LLCP2013ASC.ZIP\n",
    "# - Data Codebook: http://www.cdc.gov/brfss/annual_data/2013/pdf/codebook13_llcp.pdf\n",
    "# - Variable layout: http://www.cdc.gov/brfss/annual_data/2013/llcp_varlayout_13_onecolumn.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Each row in BRFSS data file correspondents to a respondent. The response to 321 questions is coded in\n",
    "# a single 2365 character long numeric string. The variable_list.csv file contains a maps the column number\n",
    "# to fields. For example, column 18-19 is a 2-digit code for the interview month\n",
    "var = pd.read_csv('./variable_list.csv')\n",
    "\n",
    "# We will only be looking at a subset of the columns in this analysis - these columns have been coded with a \n",
    "# Keep = Yes value in the variable list. \n",
    "varKeep = var[var['Keep'] == 'Yes']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Decode the numeric response into feature.\n",
    "for i, row in varKeep.iterrows():\n",
    "    st = row['Starting Column'] - 1\n",
    "    en = st + row['Field Length']\n",
    "    #print(st, en)\n",
    "    t[row['Variable Name']] = t['Var'].map(lambda x: x[st:en])\n",
    "    #print(row['Variable Name'])\n",
    "    \n",
    "# Create deep copy of variable\n",
    "t1 = t.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "def str_to_iso(text):\n",
    "    if text != '':\n",
    "        for fmt in (['%m%d%Y','%d%m%Y']):\n",
    "            try:\n",
    "                return datetime.isoformat(datetime.strptime(text, fmt))\n",
    "            except ValueError:\n",
    "                if text == '02292014':\n",
    "                    return datetime.isoformat(datetime.strptime('02282014', fmt))\n",
    "                elif text == '09312014':\n",
    "                    return datetime.isoformat(datetime.strptime('09302014', fmt))\n",
    "                print(text)\n",
    "                pass        \n",
    "                raise ValueError('Changing date')\n",
    "    else:\n",
    "        \n",
    "        return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# id to state map\n",
    "st = pd.read_csv('./State.csv')\n",
    "\n",
    "# Convert state value from string to int\n",
    "t1['_STATE'] = t1['_STATE'].map(lambda x: int(x))\n",
    "\n",
    "# Map numeric value of stateto state name\n",
    "st1 = st[['ID', 'State']].set_index('ID').to_dict('dict')['State']\n",
    "t1['_STATE'] = t1['_STATE'].replace(st1)\n",
    "\n",
    "# Grab avg coordinates for state\n",
    "lat = st.set_index('State')[['Latitude']].to_dict()['Latitude']\n",
    "lon = st.set_index('State')[['Longitude']].to_dict()['Longitude']\n",
    "t1['Latitude'] = t1['_STATE'].replace(lat)\n",
    "t1['Longitude'] = t1['_STATE'].replace(lon)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Convert interview date values into numeric\n",
    "t1['FMONTH'] = t1['FMONTH'].map(lambda x: int(x))\n",
    "t1['IMONTH'] = t1['IMONTH'].map(lambda x: int(x))\n",
    "t1['IDAY'] = t1['IDAY'].map(lambda x: int(x))\n",
    "t1['IDATE'] = t1['IDATE'].map(lambda x: str_to_iso(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Alcohol consumption\n",
    "\n",
    "t1['AVEDRNK2'] = t1['AVEDRNK2'].map(lambda x: int(x) if not str.isspace(x) else None) # drinks per occasion\n",
    "t1['DRNK3GE5'] = t1['DRNK3GE5'].map(lambda x: int(x) if not str.isspace(x) else None) # binge days\n",
    "t1['MAXDRNKS'] = t1['MAXDRNKS'].map(lambda x: int(x) if not str.isspace(x) else None) # max drinks per occasion in last 30 days\n",
    "t1['_DRNKDY4'] = t1['_DRNKDY4'].map(lambda x: int(x) if not str.isspace(x) else None) # drinks/day\n",
    "t1['_DRNKMO4'] = t1['_DRNKMO4'].map(lambda x: int(x) if not str.isspace(x) else None) # drinks/month\n",
    "t1['DROCDY3_'] = t1['DROCDY3_'].map(lambda x: int(x) if not str.isspace(x) else None) # drink occasions in last 30 days\n",
    "\n",
    "choice = {'1':'No', '2':'Yes', '9': 'Missing'}\n",
    "t1['_RFBING5'] = t1['_RFBING5'].replace(choice) #  binge drinker?\n",
    "\n",
    "choice = {'1':'Yes', '2':'No', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['DRNKANY5'] = t1['DRNKANY5'].replace(choice) # any drinks in last 30 days?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Activity & exercise\n",
    "# Refer to the codebook ( http://www.cdc.gov/brfss/annual_data/2013/pdf/codebook13_llcp.pdf) for variable meaning\n",
    "\n",
    "t1['METVL11_'] = t1['METVL11_'].map(lambda x: int(x) if not str.isspace(x) else None)/10 \n",
    "t1['METVL21_'] = t1['METVL21_'].map(lambda x: int(x) if not str.isspace(x) else None)/10 \n",
    "t1['MAXVO2_'] = t1['MAXVO2_'].map(lambda x: int(x) if not str.isspace(x) else None) / 100 \n",
    "t1['FC60_'] = t1['FC60_'].map(lambda x: int(x) if not str.isspace(x) else None) / 100 \n",
    "t1['PADUR1_'] = t1['PADUR1_'].map(lambda x: int(x) if not str.isspace(x) else None) \n",
    "t1['PADUR2_'] = t1['PADUR2_'].map(lambda x: int(x) if not str.isspace(x) else None) \n",
    "t1['PAFREQ1_'] = t1['PAFREQ1_'].map(lambda x: int(x) if not str.isspace(x) else None) / 1000\n",
    "t1['PAFREQ2_'] = t1['PAFREQ2_'].map(lambda x: int(x) if not str.isspace(x) else None) / 1000\n",
    "t1['STRFREQ_'] = t1['STRFREQ_'].map(lambda x: int(x) if not str.isspace(x) else None) / 1000\n",
    "t1['PAMIN11_'] = t1['PAMIN11_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['PAMIN21_'] = t1['PAMIN21_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['PA1MIN_'] = t1['PA1MIN_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['PAVIG11_'] = t1['PAVIG11_'].map(lambda x: int(x) if not str.isspace(x) else None) \n",
    "t1['PAVIG21_'] = t1['PAVIG21_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['PA1VIGM_'] = t1['PA1VIGM_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['EXERHMM1'] = t1['EXERHMM1'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['EXERHMM2'] = t1['EXERHMM2'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "\n",
    "#t1['EXEROFT1'] = t1['EXEROFT1'].map(lambda x: exerFcn(x))\n",
    "#t1['EXEROFT2'] = t1['EXEROFT2'].map(lambda x: exerFcn(x))\n",
    "\n",
    "#t1['STRENGTH'] = t1['STRENGTH'].map(lambda x: exerFcn(x))\n",
    "\n",
    "choice = {'1':'Yes', '2':'No', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['EXERANY2'] = t1['EXERANY2'].replace(choice)\n",
    "\n",
    "choice={'1': 'Had exercise in last 30 days', \n",
    "        '2': 'No exercise in last 30 days', \n",
    "        '9': 'Don’t know/Not sure/Missing'}\n",
    "\n",
    "t1['_TOTINDA'] = t1['_TOTINDA'].replace(choice)\n",
    "\n",
    "\n",
    "choice = { '0' : 'Not Moderate / Vigorous or No Activity', \n",
    "           '1' : 'Moderate',\n",
    "           '2' : 'Vigorous'}\n",
    "\n",
    "t1['ACTIN11_'] = t1['ACTIN11_'].replace(choice)\n",
    "t1['ACTIN21_'] = t1['ACTIN21_'].replace(choice)\n",
    "\n",
    "\n",
    "choice = {'1' : 'Highly Active',\n",
    "          '2' : 'Active', \n",
    "          '3' : 'Insufficiently Active', \n",
    "          '4' : 'Inactive', \n",
    "          '9' : 'Don’t know' }\n",
    "\n",
    "t1['_PACAT1'] = t1['_PACAT1'].replace(choice)\n",
    "\n",
    "choice = {'1' : 'Met aerobic recommendations', \n",
    "          '2' : 'Did not meet aerobic recommendations', \n",
    "          '9' : 'Don’t know' }\n",
    "\n",
    "t1['_PAINDX1'] = t1['_PAINDX1'].replace(choice)\n",
    "\n",
    "choice = {'1' : 'Meet muscle strengthening recommendations',\n",
    "          '2' : 'Did not meet muscle strengthening recommendations',\n",
    "          '9' : 'Missing'}\n",
    "\n",
    "t1['_PASTRNG'] = t1['_PASTRNG'].replace(choice)\n",
    "\n",
    "choice = {'1' : 'Met both guidelines', \n",
    "          '2' : 'Met aerobic guidelines only', \n",
    "          '3' : 'Met strengthening guidelines only', \n",
    "          '4' : 'Did not meet either guideline', \n",
    "          '9' : 'Missing' }\n",
    "\n",
    "t1['_PAREC1'] = t1['_PAREC1'].replace(choice)\n",
    "\n",
    "\n",
    "choice = {'1' : 'Met both guidelines',  \n",
    "          '2' : 'Did not meet both guideline', \n",
    "          '9' : 'Missing' }\n",
    "\n",
    "#t1['_PASTAE1'] = t1['_PASTAE1'].replace(choice)\n",
    "\n",
    "# Map activity code to activity names\n",
    "act = pd.read_csv('./activity.csv', encoding='iso-8859-1')\n",
    "act['Activity'] = act['Activity'].map(lambda x: re.sub(r'\\s*$','', x))\n",
    "\n",
    "t1['EXRACT11'] = t1['EXRACT11'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['EXRACT11'] = t1['EXRACT11'].replace(act.set_index('ID').to_dict()['Activity'])\n",
    "\n",
    "t1['EXRACT21'] = t1['EXRACT21'].map(lambda x: int(x) if not str.isspace(x) else '')\n",
    "t1['EXRACT21'] = t1['EXRACT21'].replace(act.set_index('ID').to_dict()['Activity'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Height, Weight, Age\n",
    "\n",
    "# BMI\n",
    "t1['_BMI5'] = t1['_BMI5'].map(lambda x: int(x) if not str.isspace(x) else None)/100\n",
    "\n",
    "choice={'1': 'Underweight', \n",
    "        '2': 'Normal weight', \n",
    "        '3': 'Overweight',\n",
    "        '4':'Obese'}\n",
    "\n",
    "t1['_BMI5CAT'] = t1['_BMI5CAT'].replace(choice)\n",
    "\n",
    "# Height & Weight\n",
    "t1['WTKG3'] = t1['WTKG3'].map(lambda x: int(x) if not str.isspace(x) else None)/100\n",
    "t1['HTM4'] = t1['HTM4'].map(lambda x: int(x) if not str.isspace(x) else None)/100\n",
    "t1['HTIN4'] = t1['HTIN4'].map(lambda x: int(x) if not str.isspace(x) else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Nutrition\n",
    "## NOTE:  Values include two implied decimal places \n",
    "# Vegetable & Fruit intake per day\n",
    "t1['_FRUTSUM'] = t1['_FRUTSUM'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['_VEGESUM'] = t1['_VEGESUM'].map(lambda x: int(x) if not str.isspace(x) else None) \n",
    "\n",
    "# Food intake - times per day\n",
    "t1['FRUTDA1_'] = t1['FRUTDA1_'].map(lambda x: int(x) if not str.isspace(x) else None) \n",
    "t1['VEGEDA1_'] = t1['VEGEDA1_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['GRENDAY_'] = t1['GRENDAY_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['ORNGDAY_'] = t1['ORNGDAY_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['FTJUDA1_'] = t1['FTJUDA1_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['BEANDAY_'] = t1['BEANDAY_'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "\n",
    "# Salt intake and advice\n",
    "choice = {'1':'Yes', '2':'No', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['WTCHSALT'] = t1['WTCHSALT'].replace(choice)\n",
    "t1['DRADVISE'] = t1['DRADVISE'].replace(choice)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Demographics\n",
    "choice = {'1' : 'Did not graduate High School', \n",
    "          '2' : 'Graduated High School',\n",
    "          '3' : 'Attended College or Technical School', \n",
    "          '4' : 'Graduated from College or Technical School', \n",
    "          '9' : 'Don’t know/Not sure/Missing'}\n",
    "\n",
    "t1['_EDUCAG'] = t1['_EDUCAG'].replace(choice)\n",
    "\n",
    "choice = {'1' : 'Male', \n",
    "          '2' : 'Female'}\n",
    "t1['SEX'] = t1['SEX'].replace(choice)\n",
    "\n",
    "choice = {'1' : '< $15000', \n",
    "          '2' : '$15,000 - $25,000',\n",
    "          '3' : '$25,000 - $35,000', \n",
    "          '4' : '$35,000 - $50,000',\n",
    "          '5' : '> $50,000', \n",
    "          '9' : 'Don’t know/Not sure/Missing'}\n",
    "\n",
    "t1['_INCOMG'] = t1['_INCOMG'].replace(choice)\n",
    "\n",
    "choice = {'1':'Employed for wages', '2':'Self-employed', '3': 'Unemployed < 1 year', '4': 'Unemployed > 1 year', '5': 'Homemaker', '6' : 'Student', '7': 'Retired' , '8': 'Unable to work', '9': 'Refused'}\n",
    "t1['EMPLOY1'] = t1['EMPLOY1'].replace(choice)\n",
    "\n",
    "choice = {'1':'< Kindergarden', '2':'Elementary', '3': 'Some high-school', '4': 'High-school graduate', '5': 'College / tech school', '6' : 'College grade', '9': 'Refused'}\n",
    "t1['EDUCA'] = t1['EDUCA'].replace(choice)\n",
    "\n",
    "choice = {'1':'Married', '2':'Divored', '4': 'Separated', '3': 'Separated',  '5': 'Never Married', '6':'Unmarried couple' , '9': 'Refused'}\n",
    "t1['MARITAL'] = t1['MARITAL'].replace(choice)\n",
    "\n",
    "choice = {'1':'Yes', '2':'No', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['VETERAN3'] = t1['VETERAN3'].replace(choice)\n",
    "\n",
    "# Age\n",
    "choice = {\n",
    "'01' : 'Age 18 to 24',\n",
    "    '02' : 'Age 25 to 29', \n",
    "    '03' : 'Age 30 to 34', \n",
    "    '04' : 'Age 35 to 39', \n",
    "    '05': 'Age 40 to 44', \n",
    "    '06' : 'Age 45 to 49', \n",
    "    '07':  'Age 50 to 54', \n",
    "    '08':  'Age 55 to 59', \n",
    "    '09': 'Age 60 to 64', \n",
    "    '10':  'Age 65 to 69', \n",
    "    '11': 'Age 70 to 74', \n",
    "    '12': 'Age 75 to 79' , \n",
    "    '13':  'Age 80 or older', \n",
    "    '14':  'Don’t know/Refused/Missing'}\n",
    "\n",
    "\n",
    "t1['_AGEG5YR'] = t1['_AGEG5YR'].replace(choice)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# General health\n",
    "choice = {'5':'Poor', '3':'Good', '1':'Excellent', '2':'Very Good', '4':'Fair', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['GENHLTH'] = t1['GENHLTH'].replace(choice)\n",
    "\n",
    "choice = {'1':'Yes', '2':'No', '7':'Don\\'t know' , '9': 'Refused'}\n",
    "t1['QLACTLM2'] = t1['QLACTLM2'].replace(choice)\n",
    "t1['USEEQUIP'] = t1['USEEQUIP'].replace(choice)\n",
    "t1['DECIDE']   = t1['DECIDE'].replace(choice)\n",
    "t1['DIFFWALK'] = t1['DIFFWALK'].replace(choice)\n",
    "t1['DIFFDRES'] = t1['DIFFDRES'].replace(choice)\n",
    "t1['DIFFALON'] = t1['DIFFALON'].replace(choice)\n",
    "\n",
    "\n",
    "t1['MENTHLTH'] = t1['MENTHLTH'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['POORHLTH'] = t1['POORHLTH'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['SLEPTIM1'] = t1['SLEPTIM1'].map(lambda x: int(x) if not str.isspace(x) else None)\n",
    "t1['PHYSHLTH'] = t1['PHYSHLTH'].map(lambda x: int(x) if not str.isspace(x) else None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Map variable names to more descriptive names\n",
    "varDict = var[['Variable Name', 'DESC']].to_dict('split')\n",
    "varDict = dict(varDict['data'])\n",
    "t1.rename(columns=varDict, inplace=True)\n",
    "\n",
    "# Replace space / special characters with underscore\n",
    "t1.rename(columns=lambda x: re.sub(' ', '_', x), inplace=True)\n",
    "t1.rename(columns=lambda x: re.sub(r'\\(|\\-|\\/|\\|\\>|\\)|\\#', '', x), inplace=True)\n",
    "t1.rename(columns=lambda x: re.sub(r'\\>', 'GT', x), inplace=True)\n",
    "\n",
    "# Delete original row\n",
    "del(t1['Var'])\n",
    "\n",
    "t1.fillna('', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Create and configure Elasticsearch index\n",
    "\n",
    "# Name of index and document type\n",
    "index_name = 'brfss';\n",
    "doc_name = 'respondent'\n",
    "\n",
    "# Delete donorschoose index if one does exist\n",
    "if es.indices.exists(index_name):\n",
    "    es.indices.delete(index_name)\n",
    "\n",
    "# Create donorschoose index\n",
    "es.indices.create(index_name)\n",
    "\n",
    "# Add mapping\n",
    "with open('brfss_mapping.json') as json_mapping:\n",
    "    d = json.load(json_mapping)\n",
    "\n",
    "es.indices.put_mapping(index=index_name, doc_type=doc_name, body=d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "### Index Data into Elasticsearch\n",
    "\n",
    "for subj_id, subject in t1.iterrows():\n",
    "    if subj_id % 1000 == 0:\n",
    "        print(subj_id)\n",
    "    thisResp = subject.to_dict()\n",
    "    thisResp['Coordinates'] = [thisResp['Longitude'], thisResp['Latitude']]\n",
    "    thisDoc = json.dumps(thisResp);\n",
    "    #pprint.pprint(thisDoc)\n",
    "\n",
    "    # write to elasticsearch\n",
    "    es.index(index=index_name, doc_type=doc_name, id=subj_id, body=thisDoc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    ""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}